#Library
#1. a. Which variables are continuous/numerical? Which are ordinal? Which are nominal?
#b. What are the methods for transforming categorical variables?
#c. Carry out and demonstrate data transformation where necessary.
# Work from the assignment folder so the relative file names used in this
# script resolve, then read the raw housing data and print a per-column
# overview.
setwd(dir = "C:/Users/Admin/Documents/Predictive Anal ASM1")
housingvaluation <- read.csv(file = "HousingValuation.csv")
summary(object = housingvaluation)
Id LotArea LotShape LandContour Utilities LotConfig
Min. : 1.0 Min. : 1300 Min. :1.000 Length:1454 Length:1454 Length:1454
1st Qu.: 365.2 1st Qu.: 7544 1st Qu.:3.000 Class :character Class :character Class :character
Median : 732.5 Median : 9478 Median :4.000 Mode :character Mode :character Mode :character
Mean : 731.3 Mean : 10521 Mean :3.591
3rd Qu.:1095.8 3rd Qu.: 11604 3rd Qu.:4.000
Max. :1460.0 Max. :215245 Max. :4.000
Slope DwellClass OverallQuality OverallCondition YearBuilt ExteriorCondition
Length:1454 Length:1454 Min. : 1.000 Min. :2.000 Min. :1872 Length:1454
Class :character Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 Class :character
Mode :character Mode :character Median : 6.000 Median :5.000 Median :1973 Mode :character
Mean : 6.103 Mean :5.576 Mean :1972
3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
Max. :10.000 Max. :9.000 Max. :2010
NA's :13
BasementCondition TotalBSF CentralAir LowQualFinSF LivingArea FullBath
Length:1454 Min. : 0 Length:1454 Min. : 0.000 Min. : 334 Min. :0.000
Class :character 1st Qu.: 796 Class :character 1st Qu.: 0.000 1st Qu.:1131 1st Qu.:1.000
Mode :character Median : 992 Mode :character Median : 0.000 Median :1466 Median :2.000
Mean :1058 Mean : 5.869 Mean :1517 Mean :1.566
3rd Qu.:1300 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:2.000
Max. :6110 Max. :572.000 Max. :5642 Max. :3.000
NA's :10
HalfBath BedroomAbvGr KitchenQuality KitchenAbvGr TotalRmsAbvGrd Fireplaces
Min. :0.0000 Min. :0.000 Length:1454 Min. :0.000 Min. : 2.00 Min. :0.0000
1st Qu.:0.0000 1st Qu.:2.000 Class :character 1st Qu.:1.000 1st Qu.: 5.00 1st Qu.:0.0000
Median :0.0000 Median :3.000 Mode :character Median :1.000 Median : 6.00 Median :1.0000
Mean :0.3831 Mean :2.869 Mean :1.047 Mean : 6.52 Mean :0.6142
3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.00 3rd Qu.:1.0000
Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.00 Max. :3.0000
GarageType GarageCars PavedDrive PoolArea OpenPorchSF MoSold
Length:1454 Min. :0.000 Length:1454 Min. : 0.00 Min. : 0.00 Min. : 1.000
Class :character 1st Qu.:1.000 Class :character 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 5.000
Mode :character Median :2.000 Mode :character Median : 0.00 Median : 25.00 Median : 6.000
Mean :1.771 Mean : 2.77 Mean : 46.37 Mean : 6.319
3rd Qu.:2.000 3rd Qu.: 0.00 3rd Qu.: 68.00 3rd Qu.: 8.000
Max. :4.000 Max. :738.00 Max. :547.00 Max. :12.000
YrSold SalePrice
Min. :2006 Min. : 34900
1st Qu.:2007 1st Qu.:130000
Median :2008 Median :163250
Mean :2008 Mean :181112
3rd Qu.:2009 3rd Qu.:214000
Max. :2010 Max. :755000
# Open the data frame in the data viewer for a visual check.
View(housingvaluation)
# Print every column's storage type together with its first few values.
str(housingvaluation)
'data.frame': 1454 obs. of 32 variables:
$ Id : int 3 4 5 6 8 12 14 15 17 21 ...
$ LotArea : int 11250 9550 14260 14115 10382 11924 10652 10920 11241 14215 ...
$ LotShape : int 3 3 3 3 3 3 3 3 3 3 ...
$ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
$ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
$ LotConfig : chr "Inside" "Corner" "FR2" "Inside" ...
$ Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
$ DwellClass : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
$ OverallQuality : int 7 7 8 5 7 9 7 6 6 8 ...
$ OverallCondition : int 5 5 5 5 6 5 5 5 7 5 ...
$ YearBuilt : int 2001 1915 2000 1993 1973 2005 2006 1960 1970 2005 ...
$ ExteriorCondition: chr "TA" "TA" "TA" "TA" ...
$ BasementCondition: chr "TA" "Gd" "TA" "TA" ...
$ TotalBSF : int 920 756 1145 796 1107 1175 1494 1253 1004 1158 ...
$ CentralAir : chr "Y" "Y" "Y" "Y" ...
$ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
$ LivingArea : int 1786 1717 2198 1362 2090 2324 1494 1253 1004 2376 ...
$ FullBath : int 2 1 2 1 2 3 2 1 1 3 ...
$ HalfBath : int 1 0 1 1 1 0 0 1 0 1 ...
$ BedroomAbvGr : int 3 3 4 1 3 4 3 2 2 4 ...
$ KitchenQuality : chr "Gd" "Gd" "Gd" "TA" ...
$ KitchenAbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
$ TotalRmsAbvGrd : int 6 7 9 5 7 11 7 5 5 9 ...
$ Fireplaces : int 1 1 1 0 2 2 1 1 1 1 ...
$ GarageType : chr "Attchd" "Detchd" "Attchd" "Attchd" ...
$ GarageCars : int 2 3 3 2 2 3 3 1 2 3 ...
$ PavedDrive : chr "Y" "Y" "Y" "Y" ...
$ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
$ OpenPorchSF : int 42 35 84 30 204 21 33 213 0 154 ...
$ MoSold : int 9 2 12 10 11 7 8 5 3 11 ...
$ YrSold : int 2008 2006 2008 2009 2009 2006 2007 2008 2010 2006 ...
$ SalePrice : int 223500 140000 250000 143000 200000 345000 279500 157000 149000 325300 ...
#Part B-Question 1
# Print the per-column summary of the working data frame.
# NOTE(review): the output pasted below already lists the numerically encoded
# columns and the dummy columns that are only created further down this
# file, so it was presumably captured after those steps ran; confirm the
# intended execution order.
summary(housingvaluation)
LotArea LotShape LandContour Slope OverallQuality OverallCondition
Min. : 1300 Min. :1.000 Min. :0.0000 Min. :0.00000 Min. : 1.000 Min. :2.000
1st Qu.: 7544 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 5.000 1st Qu.:5.000
Median : 9478 Median :4.000 Median :0.0000 Median :0.00000 Median : 6.000 Median :5.000
Mean : 10521 Mean :3.591 Mean :0.1843 Mean :0.06121 Mean : 6.103 Mean :5.576
3rd Qu.: 11604 3rd Qu.:4.000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.: 7.000 3rd Qu.:6.000
Max. :215245 Max. :4.000 Max. :3.0000 Max. :2.00000 Max. :10.000 Max. :9.000
YearBuilt ExteriorCondition BasementCondition TotalBSF CentralAir LowQualFinSF
Min. :1872 Min. :0.000 Min. :0.000 Min. : 0 Min. :0.0000 Min. : 0.000
1st Qu.:1954 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 796 1st Qu.:1.0000 1st Qu.: 0.000
Median :1973 Median :1.000 Median :2.000 Median : 992 Median :1.0000 Median : 0.000
Mean :1972 Mean :1.083 Mean :1.963 Mean :1058 Mean :0.9354 Mean : 5.869
3rd Qu.:2000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1300 3rd Qu.:1.0000 3rd Qu.: 0.000
Max. :2010 Max. :2.000 Max. :3.000 Max. :6110 Max. :1.0000 Max. :572.000
NA's :13
LivingArea FullBath HalfBath BedroomAbvGr KitchenQuality KitchenAbvGr
Min. : 334 Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.:1131 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
Median :1466 Median :2.000 Median :0.0000 Median :3.000 Median :1.000 Median :1.000
Mean :1517 Mean :1.566 Mean :0.3831 Mean :2.869 Mean :1.514 Mean :1.047
3rd Qu.:1777 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.000
Max. :5642 Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000 Max. :3.000
NA's :10
TotalRmsAbvGrd Fireplaces GarageCars PavedDrive PoolArea OpenPorchSF
Min. : 2.00 Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 5.00 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 0.00 1st Qu.: 0.00
Median : 6.00 Median :1.0000 Median :2.000 Median :2.000 Median : 0.00 Median : 25.00
Mean : 6.52 Mean :0.6142 Mean :1.771 Mean :1.858 Mean : 2.77 Mean : 46.37
3rd Qu.: 7.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.: 0.00 3rd Qu.: 68.00
Max. :14.00 Max. :3.0000 Max. :4.000 Max. :2.000 Max. :738.00 Max. :547.00
MoSold YrSold SalePrice Utilities_AllPub Utilities_NoSeWa LotConfig_Corner
Min. : 1.000 Min. :2006 Min. : 34900 Min. :0.0000 Min. :0.0000000 Min. :0.0000
1st Qu.: 5.000 1st Qu.:2007 1st Qu.:130000 1st Qu.:1.0000 1st Qu.:0.0000000 1st Qu.:0.0000
Median : 6.000 Median :2008 Median :163250 Median :1.0000 Median :0.0000000 Median :0.0000
Mean : 6.319 Mean :2008 Mean :181112 Mean :0.9993 Mean :0.0006878 Mean :0.1802
3rd Qu.: 8.000 3rd Qu.:2009 3rd Qu.:214000 3rd Qu.:1.0000 3rd Qu.:0.0000000 3rd Qu.:0.0000
Max. :12.000 Max. :2010 Max. :755000 Max. :1.0000 Max. :1.0000000 Max. :1.0000
LotConfig_CulDSac LotConfig_FR2 LotConfig_FR3 LotConfig_Inside Dwellclass_Single_Family
Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:1.0000
Median :0.00000 Median :0.00000 Median :0.000000 Median :1.0000 Median :1.0000
Mean :0.06465 Mean :0.03232 Mean :0.002751 Mean :0.7201 Mean :0.8349
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.0000
Dwellclass_Two_Family Dwellclass_Duplex Dwellclass_Townhouse_EndUnit Dwellclass_Townhouse_InsideUnite
Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
Mean :0.02132 Mean :0.03576 Mean :0.0784 Mean :0.02957
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
Twotypes_garage Attachedtohome_garage Basement_garage Buildin_garage Carport_garage
Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.000000 Median :1.0000 Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.004126 Mean :0.5983 Mean :0.01307 Mean :0.06052 Mean :0.00619
3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
Detachedfromhome_garage
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.2641
3rd Qu.:1.0000
Max. :1.0000
#PartB-Question 2 & 3
# One-hot encode DwellClass: every dwelling code gets its own 0/1 indicator
# column (NA where DwellClass is NA), then the original text column is
# removed.
dwell_codes <- c(
  Dwellclass_Single_Family = "1Fam",
  Dwellclass_Two_Family = "2fmCon",
  Dwellclass_Duplex = "Duplex",
  Dwellclass_Townhouse_EndUnit = "TwnhsE",
  Dwellclass_Townhouse_InsideUnite = "Twnhs"
)
for (dummy_name in names(dwell_codes)) {
  housingvaluation[[dummy_name]] <-
    as.numeric(housingvaluation$DwellClass == dwell_codes[[dummy_name]])
}
housingvaluation <- housingvaluation[, names(housingvaluation) != "DwellClass"]
#Handle NA values of the GarageType variable
# Build a 0/1 indicator for `x == value`, treating missing entries as 0.
#
# Args:
#   x:     vector to compare (e.g. a character column that may contain NA).
#   value: single value to test each element of `x` against.
#
# Returns:
#   A numeric vector the same length as `x`: 1 where the element equals
#   `value`, 0 where it differs or is NA. Names of `x`, if any, are kept.
#
# The previous ifelse() form returned logical(0) for zero-length input;
# this version always returns a numeric vector.
NA_numeric <- function(x, value) {
  indicator <- as.numeric(x == value)
  # Comparisons against NA yield NA; a missing entry counts as "not this
  # category".
  indicator[is.na(x)] <- 0
  names(indicator) <- names(x)
  indicator
}
# One-hot encode GarageType through NA_numeric(), so rows with a missing
# garage type get 0 in every indicator column; the original text column is
# removed afterwards.
garage_codes <- c(
  Twotypes_garage = "2Types",
  Attachedtohome_garage = "Attchd",
  Basement_garage = "Basment",
  Buildin_garage = "BuiltIn",
  Carport_garage = "CarPort",
  Detachedfromhome_garage = "Detchd"
)
for (dummy_name in names(garage_codes)) {
  housingvaluation[[dummy_name]] <-
    NA_numeric(housingvaluation$GarageType, garage_codes[[dummy_name]])
}
housingvaluation <- housingvaluation[, names(housingvaluation) != "GarageType"]
#Transform ordinal variables into numerical
# Replace each text rating with a numeric score. Every entry maps the
# codes of one column to its score; any code that is not listed (or NA)
# becomes NA.
ordinal_scores <- list(
  ExteriorCondition = c(Gd = 2, TA = 1, Fa = 0),
  CentralAir = c(N = 0, Y = 1),
  BasementCondition = c(Gd = 3, TA = 2, Fa = 1, NB = 0),
  KitchenQuality = c(Ex = 3, Gd = 2, TA = 1, Fa = 0),
  LandContour = c(Low = 3, HLS = 2, Bnk = 1, Lvl = 0),
  PavedDrive = c(Y = 2, P = 1, N = 0),
  Slope = c(Sev = 2, Mod = 1, Gtl = 0)
)
for (column in names(ordinal_scores)) {
  scores <- ordinal_scores[[column]]
  # Look the code up by name; as.character() keeps the lookup name-based
  # even if the column were a factor.
  housingvaluation[[column]] <-
    unname(scores[as.character(housingvaluation[[column]])])
}
#Part B - Question 2
#question 2: a. Calculate the summary statistics: mean, median, max and standard deviation for each of the continuous variables, and count for each categorical variable.
# Columns treated as continuous; every step below reuses this one vector so
# the lists cannot drift apart.
continuous_var <- c(
  "LotArea", "TotalBSF", "LivingArea", "SalePrice",
  "OpenPorchSF", "LowQualFinSF", "PoolArea", "GarageCars"
)

# Min / quartiles / median / mean / max (and NA count) per continuous column.
summary_Con_var <- summary(housingvaluation[, continuous_var])

# Standard deviation per continuous column, ignoring missing values.
# vapply() guarantees a named numeric vector whatever the input.
sd_results <- vapply(
  housingvaluation[continuous_var],
  function(x) sd(x, na.rm = TRUE),
  numeric(1)
)

# Count of each category: every column that is not in `continuous_var` is
# tabulated. table() leaves NA values out of the counts.
categorical_var <- housingvaluation[
  setdiff(names(housingvaluation), continuous_var)
]
frequency_tables <- lapply(categorical_var, table)

# Stack the per-column tables into one long data frame
# (Variable, Category, Frequency). rep() keeps the three columns the same
# length, so a column with no observed values adds zero rows instead of
# raising an error.
frequency_df <- do.call(rbind, lapply(names(frequency_tables), function(var) {
  counts <- frequency_tables[[var]]
  data.frame(
    Variable = rep(var, length(counts)),
    Category = as.character(names(counts)),
    Frequency = as.vector(counts)
  )
}))
write.csv(frequency_df, "categorical_frequency_table.csv", row.names = FALSE)
#Check for extreme values
# One boxplot per continuous variable on a shared 3 x 5 canvas (outliers
# drawn as points), then back to a single-panel layout.
par(mar = c(4, 4, 2, 1), mfrow = c(3, 5))
for (var in continuous_var) {
  boxplot(
    x = housingvaluation[[var]],
    outline = TRUE,
    col = "lightblue",
    ylab = var,
    main = var
  )
}
par(mfrow = c(1, 1))
#Part B - Question 3
#Histogram
# Faceted ggplot histograms of all continuous variables.
# pivot_longer() replaces the superseded gather(); bins = 30 is the value
# geom_histogram() would pick by default, stated explicitly so no
# "pick a better value" message is emitted.
housingvaluation %>%
  select(all_of(continuous_var)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30)

# Base-graphics histograms (50 breaks) on a 3 x 3 canvas, drawn in the same
# order as before. The layout is intentionally left at 3 x 3 afterwards, as
# in the original script.
par(mfrow = c(3, 3))
hist_vars <- c(
  "LotArea", "LivingArea", "TotalBSF", "SalePrice",
  "LowQualFinSF", "PoolArea", "OpenPorchSF", "GarageCars"
)
for (hist_var in hist_vars) {
  hist(
    housingvaluation[[hist_var]],
    breaks = 50,
    col = "orange",
    main = hist_var,
    xlab = hist_var  # readable axis label instead of the deparsed expression
  )
}
#Outlier
# Return the values of `x` that lie outside the Tukey fences, i.e. more
# than 1.5 interquartile ranges below the first or above the third quartile.
# Missing values are ignored; if nothing remains, numeric(0) is returned.
find_outliers <- function(x) {
  values <- x[!is.na(x)]
  if (length(values) == 0) {
    return(numeric(0))
  }
  quartiles <- quantile(values, probs = c(0.25, 0.75), names = FALSE)
  fence_width <- 1.5 * (quartiles[2] - quartiles[1])
  below <- values < quartiles[1] - fence_width
  above <- values > quartiles[2] + fence_width
  values[below | above]
}
# Collect the outlying values of every continuous variable into a list
# named by column. If find_outliers() fails for a column, that entry is
# NULL and a warning names the column and the underlying error.
#
# The column name is passed alongside the values: inside an anonymous
# function, deparse(substitute(x)) only yields the placeholder expression,
# never the actual column name.
outliers <- Map(
  function(values, var_name) {
    tryCatch(
      find_outliers(values),
      error = function(e) {
        warning(
          "Error processing variable: ", var_name,
          " (", conditionMessage(e), ")",
          call. = FALSE
        )
        NULL
      }
    )
  },
  housingvaluation[continuous_var],
  continuous_var
)
print(outliers)
#Part B - Question 4
#1 Identify missing value
# Flag every missing cell once, then derive per-column counts, the names of
# the affected columns and the percentage missing; only columns that
# actually have gaps are printed.
na_flags <- is.na(housingvaluation)
missing_values <- colSums(na_flags)
variables_with_missing <- names(which(missing_values > 0))
missing_percentages <- 100 * colMeans(na_flags)
print(missing_percentages[missing_percentages > 0])
summary(housingvaluation)
# Remove missing values
# Listwise deletion: keep only the rows with no missing value in any column.
Remove_missing_value <- housingvaluation
all.deleted <- Remove_missing_value[complete.cases(Remove_missing_value), ]

# Compare the means of the two columns that have gaps, after vs. before
# deletion.
mean(all.deleted$LivingArea, na.rm = TRUE)
mean(housingvaluation$LivingArea, na.rm = TRUE)
mean(all.deleted$YearBuilt, na.rm = TRUE)
mean(housingvaluation$YearBuilt, na.rm = TRUE)

# Pairwise correlation panel of the reduced data, written to a PNG file.
# This must plot `all.deleted`: the previous code passed
# `Replace_with_mean`, which is only created in the next section and
# belongs to a different imputation strategy.
png("correlation_plot_rmmv.png", width = 2500, height = 2500, res = 150)
pairs.panels(all.deleted, col = "red")
dev.off()

# Density after deletion (red) overlaid with the original density (blue).
plot(density(all.deleted$LivingArea), col = "red",
     main = "LivingArea Original (Blue) vs Transformed (Red)")
lines(density(Remove_missing_value$LivingArea, na.rm = TRUE), col = "blue")
plot(density(all.deleted$YearBuilt), col = "red",
     main = "YearBuilt Original (Blue) vs Transformed (Red)")
lines(density(Remove_missing_value$YearBuilt, na.rm = TRUE), col = "blue")
#Replace with mean
# Mean imputation: work on a copy and fill the gaps in YearBuilt and
# LivingArea with the mean of the observed values of that column.
Replace_with_mean <- housingvaluation
summary(Replace_with_mean)
Replace_with_mean$YearBuilt <- replace(
  Replace_with_mean$YearBuilt,
  is.na(Replace_with_mean$YearBuilt),
  mean(Replace_with_mean$YearBuilt, na.rm = TRUE)
)
mean(Replace_with_mean$YearBuilt, na.rm = TRUE)
Replace_with_mean$LivingArea <- replace(
  Replace_with_mean$LivingArea,
  is.na(Replace_with_mean$LivingArea),
  mean(Replace_with_mean$LivingArea, na.rm = TRUE)
)
mean(Replace_with_mean$LivingArea, na.rm = TRUE)

# Pairwise correlation panel of the imputed data, written to a PNG file.
png(filename = "correlation_plot_rpwm.png", width = 2500, height = 2500, res = 150)
pairs.panels(Replace_with_mean, col = "red")
dev.off()

# Imputed density (red) overlaid with the original density (blue).
plot(
  density(Replace_with_mean$YearBuilt),
  main = "YearBuilt Original (Blue) vs Transformed (Red)",
  col = "red"
)
lines(density(housingvaluation$YearBuilt, na.rm = TRUE), col = "blue")
plot(
  density(Replace_with_mean$LivingArea),
  main = "LivingArea Original (Blue) vs Transformed (Red)",
  col = "red"
)
lines(density(housingvaluation$LivingArea, na.rm = TRUE), col = "blue")
#Replace With Zero
# Zero imputation: a copy of the data in which every missing cell, in any
# column, is set to 0.
MV_zero <- replace(housingvaluation, is.na(housingvaluation), 0)

# Means after vs. before the replacement.
mean(MV_zero$YearBuilt, na.rm = TRUE)
mean(housingvaluation$YearBuilt, na.rm = TRUE)
mean(MV_zero$LivingArea, na.rm = TRUE)
mean(housingvaluation$LivingArea, na.rm = TRUE)

# Pairwise correlation panel of the zero-filled data, written to a PNG file.
png(filename = "correlation_plot.png", width = 2500, height = 2500, res = 150)
pairs.panels(MV_zero, col = "red")
dev.off()

# Zero-filled density (red) overlaid with the original density (blue).
plot(
  density(MV_zero$YearBuilt),
  main = "YearBuilt Original (Blue) vs Transformed (Red)",
  col = "red"
)
lines(density(housingvaluation$YearBuilt, na.rm = TRUE), col = "blue")
plot(
  density(MV_zero$LivingArea),
  main = "LivingArea Original (Blue) vs Transformed (Red)",
  col = "red"
)
lines(density(housingvaluation$LivingArea, na.rm = TRUE), col = "blue")
#final dataset
# Drop the Id column, then keep only the rows that have no missing value
# in any remaining column.
housingvaluation <- housingvaluation[, names(housingvaluation) != "Id"]
rows_without_na <- complete.cases(housingvaluation)
housingvaluation_complete <- housingvaluation[rows_without_na, ]
summary(housingvaluation_complete)
#Part B - Question 5
# Print the column names of `housingdataset` selected by `highlyCorr`.
# NOTE(review): neither `housingdataset` nor `highlyCorr` is created in the
# visible part of this file; presumably an earlier correlation step
# produced them. Confirm before running this section on its own.
names(housingdataset)[highlyCorr]
[1] "OverallQuality" "LivingArea" "YearBuilt"
[4] "GarageCars" "FullBath" "TotalRmsAbvGrd"
[7] "Attachedtohome_garage" "KitchenAbvGr" "Dwellclass_Single_Family"
[10] "LotConfig_Inside" "Slope" "Utilities_NoSeWa"
# Distribution of the target variable: histogram and kernel density drawn
# side by side.
selected_attr <- housingvaluation_complete[["SalePrice"]]
par(mfrow = c(1, 2))
hist(selected_attr, main = "Histogram", col = "orange")
plot(density(selected_attr, na.rm = TRUE), main = "Density")
#Distribution of selected variables against the target variable
# Working copy of `housingdataset` without the six excluded predictors,
# with the target attached as the SalePrice column.
# NOTE(review): `target_var` is not created in the visible part of this
# file; confirm it holds the sale prices in the same row order.
Selected_vars <- housingdataset %>%
  select(-c(
    KitchenAbvGr, Utilities_NoSeWa, Attachedtohome_garage,
    MoSold, YrSold, GarageCars
  ))
Selected_vars[["SalePrice"]] <- target_var
#Distribution of continuous vars with target var
# Scatter plot of SalePrice against each continuous predictor, with a
# straight-line (lm) fit in red, arranged three per row.
continuous_vars2 <- c(
  "LotArea", "TotalBSF", "LivingArea",
  "OpenPorchSF", "LowQualFinSF", "PoolArea"
)
# Kept from the original script: this sets the base-graphics layout only
# and has no effect on the ggplot/grid output below.
par(mfrow = c(2, 3))

# Build the plots in a function so each one captures its own column name.
# The .data pronoun replaces the deprecated aes_string(), and the explicit
# formula avoids the "using formula = 'y ~ x'" message.
plot_list <- lapply(
  setNames(continuous_vars2, continuous_vars2),
  function(var) {
    force(var)
    ggplot(Selected_vars, aes(x = .data[[var]], y = .data[["SalePrice"]])) +
      geom_point(alpha = 0.5, color = "steelblue") +
      geom_smooth(method = "lm", formula = y ~ x, color = "red", se = FALSE) +
      labs(x = var, y = "Sale Price", title = paste("Sale Price vs", var)) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title = element_text(face = "bold"),
        axis.text = element_text(size = 8)
      )
  }
)
grid.arrange(grobs = plot_list, ncol = 3)
#Distribution of categorical vars with target var
# Every column of Selected_vars that is neither a continuous predictor nor
# the target.
categorical_vars2 <- Selected_vars %>%
  select(-c(
    LotArea, TotalBSF, LivingArea, OpenPorchSF,
    LowQualFinSF, PoolArea, SalePrice
  ))

# Long format (one row per observation and variable), reduced to the mean
# sale price of each level of each variable.
plot_data <- categorical_vars2 %>%
  mutate(SalePrice = Selected_vars$SalePrice) %>%
  pivot_longer(
    cols = -SalePrice,
    names_to = "Variable",
    values_to = "Category"
  ) %>%
  group_by(Variable, Category) %>%
  summarise(MeanSalePrice = mean(SalePrice, na.rm = TRUE), .groups = "drop")

# Compact bar chart of mean sale price (in thousands of dollars) per level
# of one variable.
mean_price_bars <- function(var) {
  rows_for_var <- plot_data$Variable == var
  compact_theme <- theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    plot.title = element_text(size = 8),
    plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "cm")
  )
  ggplot(plot_data[rows_for_var, ], aes(x = Category, y = MeanSalePrice)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    theme_minimal() +
    compact_theme +
    labs(title = var, x = NULL, y = NULL) +
    scale_y_continuous(
      labels = scales::dollar_format(scale = 1e-3, suffix = "K")
    )
}
plot_list <- lapply(unique(plot_data$Variable), mean_price_bars)

# Arrange all charts on a grid that is as close to square as possible.
n_plots <- length(plot_list)
n_cols <- ceiling(sqrt(n_plots))
n_rows <- ceiling(n_plots / n_cols)
cat_dis <- grid.arrange(
  grobs = plot_list,
  nrow = n_rows,
  ncol = n_cols,
  top = "Distribution of Categorical Variables vs SalePrice"
)
#Test for skewness
# Visual skewness check: faceted histograms of the continuous predictors.
# pivot_longer() replaces the superseded gather(); bins = 30 is the default
# bin count made explicit so no bin-width message is emitted.
housingdataset %>%
  select(all_of(continuous_vars2)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30)

# Level counts of the remaining (non-continuous) predictors as faceted bar
# charts.
categorical_var3 <- colnames(categorical_vars2)
housingdataset %>%
  select(all_of(categorical_var3)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_bar()
#Transform to normal distribution
# Log-transform the right-skewed predictors in place.
# log() is undefined at 0 and below, so such values are first replaced by
# 0.01; every affected row therefore ends up at log(0.01), about -4.6.
right_skewcols <- c("LivingArea", "OpenPorchSF", "TotalBSF", "LotArea")
housingdataset[right_skewcols] <- lapply(
  housingdataset[right_skewcols],
  function(x) {
    x[x <= 0] <- 0.01
    log(x)
  }
)

# Re-draw the histograms to inspect the effect of the transformation.
# pivot_longer() replaces the superseded gather(); bins = 30 is the default
# bin count made explicit so no bin-width message is emitted.
housingdataset %>%
  select(all_of(continuous_vars2)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30)
#Part C- question 1
# Print the coefficient table of the fitted model `hmodel2`: estimate,
# standard error, t value and p value for each term.
# NOTE(review): `hmodel2` is not fitted in the visible part of this file.
summary(hmodel2)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.833821e+06 187818.60432 -9.76378708 1.718300e-21
LotArea 2.486928e+04 3904.80433 6.36889185 3.007553e-10
LotShape 2.527157e+03 2517.55135 1.00381539 3.157324e-01
LandContour 1.522128e+02 2608.27264 0.05835769 9.534764e-01
Slope 7.718414e+03 5734.40777 1.34598279 1.786407e-01
OverallQuality 2.004424e+04 1634.30220 12.26470852 3.956659e-32
OverallCondition 5.575092e+03 1442.22800 3.86561101 1.186240e-04
YearBuilt 5.495736e+02 81.42688 6.74929020 2.631501e-11
ExteriorCondition -9.264129e+02 3981.01178 -0.23270791 8.160402e-01
BasementCondition -4.717354e+03 4424.54518 -1.06617819 2.866237e-01
TotalBSF 2.525382e+03 1063.61873 2.37432998 1.778582e-02
CentralAir -9.180487e+03 6083.25797 -1.50913991 1.316075e-01
LowQualFinSF -1.778281e+01 22.74613 -0.78179519 4.345368e-01
LivingArea 5.883810e+04 9466.49113 6.21540748 7.761564e-10
FullBath -1.894774e+02 3629.51900 -0.05220454 9.583771e-01
HalfBath -5.376196e+03 3016.16044 -1.78246345 7.500446e-02
BedroomAbvGr -1.128418e+04 2204.15643 -5.11950156 3.732967e-07
KitchenQuality 1.960675e+04 2701.32514 7.25819844 8.359075e-13
TotalRmsAbvGrd 7.030814e+03 1599.20751 4.39643625 1.229220e-05
Fireplaces 6.948801e+03 2265.97287 3.06658620 2.228683e-03
PavedDrive 2.103123e+02 2839.80703 0.07405866 9.409799e-01
PoolArea 3.340128e+01 28.84393 1.15800029 2.471654e-01
OpenPorchSF -2.265520e+02 331.81891 -0.68275806 4.949322e-01
Utilities_AllPub 6.545862e+04 39148.75661 1.67204843 9.485591e-02
LotConfig_Corner -2.833160e+03 3336.10807 -0.84924113 3.959688e-01
LotConfig_CulDSac 7.578925e+03 5519.00873 1.37324019 1.700134e-01
LotConfig_FR2 -4.029876e+03 6948.49180 -0.57996406 5.620813e-01
LotConfig_FR3 -9.110845e+03 21840.81657 -0.41714762 6.766681e-01
Dwellclass_Single_Family 4.519468e+03 8942.75442 0.50537765 6.134151e-01
Dwellclass_Two_Family -6.463472e+02 13161.53489 -0.04910880 9.608433e-01
Dwellclass_Duplex 5.069052e+03 11440.51551 0.44307898 6.578131e-01
Dwellclass_Townhouse_EndUnit 6.006218e+02 8655.96190 0.06938822 9.446957e-01
Twotypes_garage 2.112669e+03 26843.59405 0.07870291 9.372861e-01
Basement_garage 1.870783e+03 11068.23606 0.16902273 8.658160e-01
Buildin_garage 8.084745e+03 5608.13870 1.44160933 1.497540e-01
Carport_garage -1.296701e+04 15034.96768 -0.86245667 3.886618e-01
Detachedfromhome_garage 2.912890e+03 3356.42972 0.86785370 3.857015e-01
# Predicted vs. actual SalePrice on the test set, with a smoothed trend
# line, shown as an interactive plotly figure.
# The line colour is set as a constant outside aes(): writing
# aes(colour = 'red') maps the literal string "red" as a data value, which
# draws the line in the default palette colour and adds a legend entry
# labelled "red".
plot3 <- test.set3 %>%
  ggplot(aes(SalePrice, predicted.SalePrice)) +
  geom_point(alpha = 0.5) +
  stat_smooth(colour = "red") +
  xlab("Actual value of SalePrice") +
  ylab("Predicted value of SalePrice") +
  theme_bw()
ggplotly(plot3)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#Decision Tree
# Take `housingdataset_selected1` as the working data for the decision tree.
# NOTE(review): the console output recorded directly below shows this
# assignment failing because `housingdataset_selected1` does not exist;
# the object is not created in the visible part of this file. Confirm which
# data set was intended.
dcthousing <- housingdataset_selected1
Error: object 'housingdataset_selected1' not found
# Print the variable-importance scores stored on the tree object `dtree`.
# NOTE(review): `dtree` is not fitted in the visible part of this file.
dtree$variable.importance
OverallQuality LivingArea TotalBSF KitchenQuality YearBuilt TotalRmsAbvGrd
3.943247e+12 1.054748e+12 1.043928e+12 1.038985e+12 7.239169e+11 4.288177e+11
BedroomAbvGr LotArea HalfBath FullBath Fireplaces Buildin_garage
2.170179e+11 1.509278e+11 1.429624e+11 1.346425e+11 4.652120e+10 1.924287e+10
LotConfig_Corner
1.635907e+10
# Print the RMSE held in `dct_rmse` after a text label; paste() joins the
# two pieces with its default single-space separator.
# NOTE(review): `dct_rmse` is not computed in the visible part of this file.
print(paste("Root Mean Square Error: ", dct_rmse))
[1] "Root Mean Square Error: 43645.6040879741"
# Print the node-by-node structure of `pruned.dtree` (split, n, deviance,
# yval).
# NOTE(review): `pruned.dtree` is not created in the visible part of this
# file.
print(pruned.dtree)
n= 954
node), split, n, deviance, yval
* denotes terminal node
1) root 954 6.017664e+12 181808.0
2) OverallQuality< 7.5 801 1.867541e+12 157862.2
4) OverallQuality< 6.5 592 8.130738e+11 140674.1
8) LivingArea< 7.235619 369 3.024860e+11 125960.4
16) TotalBSF< 6.915227 242 1.488286e+11 114974.5 *
17) TotalBSF>=6.915227 127 6.879663e+10 146894.1 *
9) LivingArea>=7.235619 223 2.985157e+11 165020.8 *
5) OverallQuality>=6.5 209 3.841739e+11 206548.1
10) LivingArea< 7.507689 136 1.282820e+11 189035.2 *
11) LivingArea>=7.507689 73 1.364708e+11 239175.0 *
3) OverallQuality>=7.5 153 1.286291e+12 307171.2
6) OverallQuality< 8.5 111 4.357182e+11 275362.6
12) LivingArea< 7.564236 64 1.461172e+11 245779.4 *
13) LivingArea>=7.564236 47 1.573209e+11 315646.1 *
7) OverallQuality>=8.5 42 4.414506e+11 391236.6
14) LivingArea< 7.610357 19 1.950673e+10 332807.7 *
15) LivingArea>=7.610357 23 3.034949e+11 439504.0 *